## Loading required package: digest
## Loading required package: tibble
## Loading required package: ggplot2
## Project name: 01.protein-seq-evo-v1
## Loading project configuration
## Autoloading packages
## Loading package: dplyr
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading package: stringr
## Loading required package: stringr
## Loading package: readr
## Loading required package: readr
## Loading package: ggplot2
## Loading package: tidyr
## Loading required package: tidyr
## Loading package: patchwork
## Loading required package: patchwork
## Loading package: gridExtra
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## Loading package: GGally
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Loading package: readxl
## Loading required package: readxl
## Loading package: viridis
## Loading required package: viridis
## Loading required package: viridisLite
## Loading package: cowplot
## Loading required package: cowplot
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
##
## align_plots
## Loading package: ggsignif
## Loading required package: ggsignif
## Loading package: minpack.lm
## Loading required package: minpack.lm
## Loading package: purrr
## Loading required package: purrr
## Loading package: scales
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:viridis':
##
## viridis_pal
## The following object is masked from 'package:readr':
##
## col_factor
## Loading package: bigsnpr
## Loading required package: bigsnpr
## Loading required package: bigstatsr
## Loading package: drc
## Loading required package: drc
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
##
## 'drc' has been loaded.
## Please cite R and 'drc' if used for a publication,
## for references type 'citation()' and 'citation('drc')'.
##
## Attaching package: 'drc'
## The following objects are masked from 'package:stats':
##
## gaussian, getInitial
## Loading package: data.table
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## Autoloading helper functions
## Running helper script: globals.R
## Running helper script: helpers.R
## Autoloading data
## Munging data
## Running preprocessing script: 01-util.R
## Sourcing R script: 01-util.R
## Running preprocessing script: 02-munge_kras.R
## Sourcing R script: 02-munge_kras.R
## Rows: 189 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3591 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3780 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3572 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): mutant
## dbl (11): column_coverage, popEVE, pop-adjusted_ESM1v, pop-adjusted_EVH_epis...
## lgl (3): pop-adjusted_EVE, pop-adjusted_Tranception, EVE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3780 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): wt_aa, mt_aa, ClinVar_ClinicalSignificance, Starred_Coarse_Grained...
## dbl (11): position, Gold_Stars, NumberSubmitters, frequency_gv2, frequency_g...
## lgl (14): BS1, PM2, PM5, PP5, BP6, b_model, p_model, b_acmg_model, lb_acmg_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2587 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): id, wt_aa, mt_aa, bp_interface, binding_RAF, binding_RAL, binding_...
## dbl (20): Pos_real, abundance_ddg, abundance_ddg_std, pik3cg_ddg, pik3cg_ddg...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3453 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id, wt_codon
## dbl (4): Pos_real, mean_kcal/mol, std_kcal/mol, ESM1v
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Running preprocessing script: 03-munge_src.R
##
## Sourcing R script: 03-munge_src.R
##
## Rows: 5112 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): id
## dbl (8): FL_activity_mean_kcal/mol, FL_activity_std_kcal/mol, FL_folding_mea...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 536 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10720 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 536 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): WT_AA, domains_Pfam, KD_lobe, secondary_structure_uniprot, seconda...
## dbl (1): position
## lgl (11): block1, block2, block3, block4, block5, R-spine, C-spine, Communit...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4898 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id, wt_aa
## dbl (9): FL_kinase_fitness_scaled, FL_kinase_sigma_scaled, FL_abundance_fitn...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Running preprocessing script: 04-munge_psd95.R
##
## Sourcing R script: 04-munge_psd95.R
##
## Rows: 3154 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id_eve, id_old, trait_name, library, assay, pdz_name, alignment_po...
## dbl (26): X, V1, pos_am, ddg, std_ddg, ci95_kcal.mol, pdz, structural_alignm...
## lgl (1): binding_interface_5A
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Running preprocessing script: 05-munge-grb2.R
##
## Sourcing R script: 05-munge-grb2.R
##
## Rows: 1056 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): id, old_id, id_ref, SS, Pos_class, protein, WT_AA, Mut, wt_aa.x, m...
## dbl (23): Pos_real, Pos_ref, Pos, mut_order, f_dg_pred, f_ddg_pred, f_ddg_pr...
## lgl (5): f_ddg_pred_conf, b_ddg_pred_conf, allosteric, orthosteric, alloste...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1689 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): id, protein, pca_type, aa_seq, old_id, wt_aa.x, mt_aa, wt_aa.y, at...
## dbl (14): Pos_real, Nham_aa, fitness, sigma, growthrate, growthrate_sigma, c...
## lgl (1): WT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pten_abundance <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ddg/proteinGym/PTEN_HUMAN_Matreyek_2021.csv", header = TRUE)
pten_activity <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ddg/proteinGym/PTEN_HUMAN_Mighell_2018.csv", header = TRUE)
pten_esm <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ESM1v/P60484.csv")
nrow(pten_abundance) #5083
## [1] 5083
nrow(pten_activity) #7260
## [1] 7260
nrow(pten_esm) #7657
## [1] 7657
pten_df <- merge(pten_abundance, pten_esm, by.x = "mutant", by.y = "variant")
pten_df <- pten_df %>% dplyr::select (mutant, DMS_score, DMS_score_bin, ESM.1v) %>%
dplyr::rename(DMS_score_abundance = DMS_score,
DMS_score_bin_abundance = DMS_score_bin)
pten_df <- merge(pten_df, pten_activity, by = "mutant")
pten_df <- pten_df %>% dplyr::rename(DMS_score_activity = DMS_score,
DMS_score_bin_activity = DMS_score_bin) %>%
dplyr::select (mutant, DMS_score_abundance, DMS_score_bin_abundance, ESM.1v,
DMS_score_activity, DMS_score_bin_activity)
pten_df <- pten_df %>%
mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
nrow(pten_df) #4839
## [1] 4839
pten_clinvar <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_clinvar/P60484_clinvar_PTEN.csv")
nrow(pten_clinvar) #7657
## [1] 7657
head(pten_clinvar)
## variant V1 Model Dataset ddG_pred position wildtype
## 1 A120C 2381 ThermoMPNN AF-P60484-F1-model_v2 0.7477777 119 A
## 2 A120D 2382 ThermoMPNN AF-P60484-F1-model_v2 2.9353685 119 A
## 3 A120E 2383 ThermoMPNN AF-P60484-F1-model_v2 2.7981739 119 A
## 4 A120F 2384 ThermoMPNN AF-P60484-F1-model_v2 1.7522278 119 A
## 5 A120G 2385 ThermoMPNN AF-P60484-F1-model_v2 1.6180537 119 A
## 6 A120H 2386 ThermoMPNN AF-P60484-F1-model_v2 2.7476149 119 A
## mutation pdb chain new_position uniprot exposure_SS
## 1 C AF-P60484-F1-model_v2 A 120 P60484 E
## 2 D AF-P60484-F1-model_v2 A 120 P60484 E
## 3 E AF-P60484-F1-model_v2 A 120 P60484 E
## 4 F AF-P60484-F1-model_v2 A 120 P60484 E
## 5 G AF-P60484-F1-model_v2 A 120 P60484 E
## 6 H AF-P60484-F1-model_v2 A 120 P60484 E
## exposure_rASA spot_disorder func_esms_residue_class func_esms_variant_class
## 1 0 0 2 2
## 2 0 0 2 2
## 3 0 0 2 2
## 4 0 0 2 2
## 5 0 0 2 2
## 6 0 0 2 2
## clinvar_clinical_significance
## 1
## 2
## 3 conflict
## 4
## 5 VUS
## 6
## sequence
## 1 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 2 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 3 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 5 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 6 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
pten_df <- merge(pten_df, pten_clinvar, by.x="mutant", by.y = "variant", all.x = TRUE)
nrow(pten_df) #4839
## [1] 4839
pten_var <- read_excel("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_clinvar/taylor_supplement.xlsx", sheet = "Table S3")
head(pten_var)
## # A tibble: 6 × 34
## Subject_ID Nucleotide_Change Amino_Acid_Change AA_one_letter Age_Assessment
## <dbl> <chr> <chr> <chr> <dbl>
## 1 330 c.716T>C p.(Met239Thr) M239T 26
## 2 370 c.395G>A p.(Gly132Asp) G132D 63
## 3 435 c.406T>C p.(Cys136Arg) C136R 52
## 4 522 c.103A>G p.(Met35Val) M35V 35
## 5 897 c.284C>T p.(Pro95Leu) P95L 17
## 6 1005 c.144C>A p.(Asn48Lys) N48K 39
## # ℹ 29 more variables: Sex <chr>, CC_Score <chr>, Fitness_Score <dbl>,
## # Abundance_Score <dbl>, Phenotype_List <chr>, Phenotype_Class <chr>,
## # ASD <lgl>, PHTS <lgl>, Age_Head_Measure <dbl>, `Head_Measure(cm)` <dbl>,
## # Head_size_expected <dbl>, Head_size_expected_SD <dbl>, Head_z_score <dbl>,
## # CADD_Score <dbl>, Fitness_score_coded <dbl>, Abundance_score_coded <dbl>,
## # Quadrant <dbl>, Thyroid_Age_Dx <chr>, Breast_1_Age_Dx <chr>,
## # Breast_2_Age_Dx <chr>, Melanoma_Age_Dx <chr>, Colorec_Age_Dx <chr>, …
nrow(pten_var) #145
## [1] 145
pten_var <- pten_var %>% dplyr::select(AA_one_letter, Phenotype_Class, Phenotype_List, Fitness_Score, Abundance_Score)
pten_df <- merge(pten_df, pten_var, by.x="mutant", by.y = "AA_one_letter", all.x = TRUE)
nrow(pten_df) #4873
## [1] 4873
pten_gnomad <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_gnomad/PTEN_gnomAD_v4.1.0_ENST00000371953_2025_04_08_14_29_46.csv")
pten_gnomad <- pten_gnomad %>% filter(VEP.Annotation == "missense_variant")
summary(pten_gnomad$Allele.Frequency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.195e-07 6.197e-07 6.211e-07 2.946e-06 1.859e-06 1.289e-04
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 6.195e-07 6.197e-07 6.211e-07 2.946e-06 1.859e-06 1.289e-04
pten_gnomad <- pten_gnomad %>% dplyr::select(HGVS.Consequence, ClinVar.Germline.Classification,
rsIDs, ClinVar.Variation.ID)
# Apply to the column
aa_map <- c(
Ala="A", Arg="R", Asn="N", Asp="D", Cys="C",
Gln="Q", Glu="E", Gly="G", His="H", Ile="I",
Leu="L", Lys="K", Met="M", Phe="F", Pro="P",
Ser="S", Thr="T", Trp="W", Tyr="Y", Val="V",
Ter="*"
)
convert_to_short <- function(consequence) {
consequence <- gsub("^p\\.", "", consequence) # Remove 'p.'
matches <- regmatches(consequence, regexec("([A-Za-z]{3})([0-9]+)([A-Za-z]{3})", consequence))[[1]]
if (length(matches) == 4) {
from <- aa_map[[matches[2]]]
pos <- matches[3]
to <- aa_map[[matches[4]]]
return(paste0(from, pos, to))
} else {
return(NA)
}
}
pten_gnomad$ID <- sapply(pten_gnomad$HGVS.Consequence, convert_to_short)
head(pten_gnomad)
## HGVS.Consequence ClinVar.Germline.Classification rsIDs
## 1 p.Ile5Met rs2132145458
## 2 p.Val9Asp rs2132145566
## 3 p.Ser10Gly Uncertain significance rs572685299
## 4 p.Tyr16His Uncertain significance rs1064796078
## 5 p.Asp19Gly Uncertain significance rs1554890392
## 6 p.Asp22Gly Conflicting classifications of pathogenicity rs1554890398
## ClinVar.Variation.ID ID
## 1 NA I5M
## 2 NA V9D
## 3 428272 S10G
## 4 428195 Y16H
## 5 492333 D19G
## 6 486221 D22G
nrow(pten_gnomad) #235
## [1] 235
length(unique(pten_gnomad$ID))
## [1] 235
pten_gnomad$gnomad <- "gnomad"
pten_df <- merge(pten_df, pten_gnomad, by.x="mutant", by.y = "ID", all.x = TRUE)
nrow(pten_df) #4873
## [1] 4873
head(pten_df)
## mutant DMS_score_abundance DMS_score_bin_abundance ESM.1v
## 1 A120E -0.075192482 0 -16.59294
## 2 A120F 0.449809072 0 -16.11391
## 3 A120G 0.639477479 0 -10.35606
## 4 A120L 0.023241040 0 -14.00481
## 5 A120P -0.004816027 0 -14.67169
## 6 A120Q 0.108151133 0 -19.15990
## DMS_score_activity DMS_score_bin_activity mutation_position V1 Model
## 1 -2.0938569 0 120 2383 ThermoMPNN
## 2 -3.4270689 0 120 2384 ThermoMPNN
## 3 -0.0142132 1 120 2385 ThermoMPNN
## 4 -3.4214305 0 120 2389 ThermoMPNN
## 5 -3.4173247 0 120 2392 ThermoMPNN
## 6 -4.5183149 0 120 2393 ThermoMPNN
## Dataset ddG_pred position wildtype mutation
## 1 AF-P60484-F1-model_v2 2.798174 119 A E
## 2 AF-P60484-F1-model_v2 1.752228 119 A F
## 3 AF-P60484-F1-model_v2 1.618054 119 A G
## 4 AF-P60484-F1-model_v2 1.529649 119 A L
## 5 AF-P60484-F1-model_v2 2.719771 119 A P
## 6 AF-P60484-F1-model_v2 2.936961 119 A Q
## pdb chain new_position uniprot exposure_SS exposure_rASA
## 1 AF-P60484-F1-model_v2 A 120 P60484 E 0
## 2 AF-P60484-F1-model_v2 A 120 P60484 E 0
## 3 AF-P60484-F1-model_v2 A 120 P60484 E 0
## 4 AF-P60484-F1-model_v2 A 120 P60484 E 0
## 5 AF-P60484-F1-model_v2 A 120 P60484 E 0
## 6 AF-P60484-F1-model_v2 A 120 P60484 E 0
## spot_disorder func_esms_residue_class func_esms_variant_class
## 1 0 2 2
## 2 0 2 2
## 3 0 2 2
## 4 0 2 2
## 5 0 2 2
## 6 0 2 2
## clinvar_clinical_significance
## 1 conflict
## 2
## 3 VUS
## 4
## 5
## 6
## sequence
## 1 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 2 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 3 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 5 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 6 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## Phenotype_Class
## 1 PHTS
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## Phenotype_List
## 1 Adenomatous Polyps|Anemia|Benign Neurologic Neoplasms|Gastroesophageal Reflux Disease|Hamartomatous Polyp|Hemangioma of Skin|Inflammatory Polyp|Lipoma|Macrocephaly|Malignant Neoplasm of Prostate|Polyp Hyperplastic Benign
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## Fitness_Score Abundance_Score HGVS.Consequence
## 1 -2.093857 -0.07519248 <NA>
## 2 NA NA <NA>
## 3 NA NA <NA>
## 4 NA NA <NA>
## 5 NA NA <NA>
## 6 NA NA <NA>
## ClinVar.Germline.Classification rsIDs ClinVar.Variation.ID gnomad
## 1 <NA> <NA> NA <NA>
## 2 <NA> <NA> NA <NA>
## 3 <NA> <NA> NA <NA>
## 4 <NA> <NA> NA <NA>
## 5 <NA> <NA> NA <NA>
## 6 <NA> <NA> NA <NA>
all_positions <- c(
88:98, # WPD loop
160:171, # TI loop
123:130, # P loop
35:49, # Arginine loop
200:212, # CBR1
226:238, # CBR2
258:268, # CBR3
327:335, # Cα2 loop
151:174, # membrane-binding alpha-helix
1:15 # PBM motif
)
length(pten_df[is.na(pten_df$DMS_score_abundance),] %>% pull(mutant)) #
## [1] 0
pten_df <- pten_df[!is.na(pten_df$DMS_score_abundance), ]
nrow(pten_df) #4873
## [1] 4873
fil_pten_df <- pten_df %>%
filter(!mutation_position %in% all_positions) %>% distinct(mutant, .keep_all = TRUE)
nrow(fil_pten_df) #3454
## [1] 3454
# Fit a loess model using the filtered data
loess_fit <- loess(DMS_score_activity ~ DMS_score_abundance, data = fil_pten_df, span = 0.7, family = "symmetric")
# Predict fitted values for ALL data points using the loess model trained on fil_pten_df
pten_df$fitted <- predict(loess_fit, newdata = pten_df)
# Calculate residuals for ALL points
pten_df$residuals <- pten_df$DMS_score_activity - pten_df$fitted
range(pten_df$residuals) #-5.157748 4.754293
## [1] -5.157748 4.754293
fit_line_df <- data.frame(
DMS_score_abundance = seq(min(fil_pten_df$DMS_score_abundance, na.rm = TRUE),
max(fil_pten_df$DMS_score_abundance, na.rm = TRUE),
length.out = 200)
)
fit_line_df$DMS_score_activity <- predict(loess_fit, newdata = fit_line_df)
range(pten_df$residuals)
## [1] -5.157748 4.754293
range(pten_df$DMS_score_abundance)
## [1] -0.2776983 1.6652985
range(pten_df$DMS_score_activity)
## [1] -5.754822 2.817851
pten_df <- pten_df %>%
mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
nrow(pten_df)
## [1] 4873
median_residuals <- pten_df %>%
group_by(mutation_position) %>%
summarise(median_residuals = median(residuals, na.rm = TRUE))
min(median_residuals$median_residuals) #-1.861488
## [1] -4.034455
max(median_residuals$median_residuals) #4.034455
## [1] 1.861488
pdb <- read.pdb("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/PTEN/1d5r.pdb")
data <- median_residuals
head(data)
## # A tibble: 6 × 2
## mutation_position median_residuals
## <dbl> <dbl>
## 1 2 0.728
## 2 3 1.34
## 3 5 -0.224
## 4 6 -1.19
## 5 7 0.579
## 6 9 -0.625
# Create a new B-factor vector initialized with the current B-factors from the PDB
new_b_factors <- pdb$atom$b
# Loop through each position in the correlation data and update the B-factors
for (i in 1:nrow(data)) {
position <- data$mutation_position[i]
correlation_value <- data$median_residuals[i]
# Find indices in the PDB that match the current position
indices <- which(pdb$atom$resno == position)
# Print the indices and current B-factors before updating
#cat("Updating residue number:", position, "\n")
#cat("Indices in PDB:", indices, "\n")
#cat("Current B-factors:", new_b_factors[indices], "\n")
# Update B-factors for all atoms in the current residue
new_b_factors[indices] <- correlation_value
# Print the new B-factors after updating
#cat("Updated B-factors:", new_b_factors[indices], "\n")
#cat("\n") # Add an extra line for readability
}
# Replace non-matching B-factors with outlier value so we can filter it out in ChimeraX
non_matching_indices <- setdiff(seq_along(new_b_factors), which(pdb$atom$resno %in% data$mutation_position))
new_b_factors[non_matching_indices] <- 999
# Assign the new B-factors back to the pdb structure
# Write the modified PDB structure to a new file
pdb$atom$b <- new_b_factors
write.pdb(pdb, file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/PTEN/1d5r_median_residual_loess.pdb")
pten_soma <- read.delim("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_somatic/All_cBioportal_nonredundant_210417.tsv", header = TRUE, sep = "\t")
pten_soma <- pten_soma %>%
filter(!grepl("\\*", Protein.Change))
pten_soma$somatic <- "Somatic"
head(pten_soma)
## Study Sample.ID
## 1 Acral Melanoma (TGEN, Genome Res 2017) 148336T-25b
## 2 Non-Small Cell Cancer (MSKCC, Cancer Discov 2017) P-0003863-T01-IM5
## 3 Non-Small Cell Cancer (MSKCC, Cancer Discov 2017) P-0009431-T01-IM5
## 4 Uterine Sarcoma/Mesenchymal (MSK, Clin Cancer Res 2020) P-0001821-T01-IM3
## 5 Pan-Lung Cancer (TCGA, Nat Genet 2016) TCGA-39-5027-01
## 6 Pan-Lung Cancer (TCGA, Nat Genet 2016) TCGA-21-5783-01
## Cancer.Type Protein.Change
## 1 R130G
## 2 R130Q
## 3 R130G
## 4 R130Q
## 5 R130Q
## 6 R130Q
## Annotation
## 1 OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 2 OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 3 OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 4 OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 5 OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 6 OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## Functional.Impact
## 1 MutationAssessor: impact: high, score: 4.165;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 2 MutationAssessor: impact: high, score: 3.615;SIFT: impact: deleterious, score: 0.02;Polyphen-2: impact: probably_damaging, score: 0.998
## 3 MutationAssessor: impact: high, score: 4.165;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 4 MutationAssessor: impact: high, score: 3.615;SIFT: impact: deleterious, score: 0.02;Polyphen-2: impact: probably_damaging, score: 0.998
## 5 MutationAssessor: impact: high, score: 3.615;SIFT: impact: deleterious, score: 0.02;Polyphen-2: impact: probably_damaging, score: 0.998
## 6 MutationAssessor: impact: high, score: 3.615;SIFT: impact: deleterious, score: 0.02;Polyphen-2: impact: probably_damaging, score: 0.998
## Mutation.Type Variant.Type Copy.. COSMIC MS VS
## 1 Missense_Mutation SNP diploid 222 <NA> <NA>
## 2 Missense_Mutation SNP diploid 222 SOMATIC <NA>
## 3 Missense_Mutation SNP diploid 222 SOMATIC <NA>
## 4 Missense_Mutation SNP diploid 222 SOMATIC Unknown
## 5 Missense_Mutation SNP shallowdel 222 Somatic <NA>
## 6 Missense_Mutation SNP shallowdel 222 Somatic <NA>
## Center Chromosome Start.Pos End.Pos Ref Var HGVSg
## 1 . 10 89692904 89692904 C G 10:g.89692904C>G
## 2 MSKCC 10 89692905 89692905 G A 10:g.89692905G>A
## 3 MSKCC 10 89692904 89692904 C G 10:g.89692904C>G
## 4 MSKCC 10 89692905 89692905 G A 10:g.89692905G>A
## 5 broad.mit.edu 10 89692905 89692905 G A 10:g.89692905G>A
## 6 broad.mit.edu 10 89692905 89692905 G A 10:g.89692905G>A
## HGVSc Allele.Freq..T. Allele.Freq..N. Variant.Reads
## 1 ENST00000371953.3:c.388C>G NA NA NA
## 2 ENST00000371953.3:c.389G>A NA NA NA
## 3 ENST00000371953.3:c.388C>G NA NA NA
## 4 ENST00000371953.3:c.389G>A 0.35 NA 174
## 5 ENST00000371953.3:c.389G>A 0.23 NA 17
## 6 ENST00000371953.3:c.389G>A 0.42 NA 39
## Ref.Reads Variant.Reads..Normal. Ref.Reads..Normal. X..Mut.in.Sample Exon
## 1 NA NA NA 357 5/9
## 2 NA NA NA 8 5/9
## 3 NA NA NA 9 5/9
## 4 318 NA 275 65 5/9
## 5 56 NA 81 295 5/9
## 6 53 NA 186 363 5/9
## gnomAD ClinVar
## 1 NA Likely pathogenic, Pathogenic
## 2 NA Pathogenic, Likely pathogenic, Pathogenic/Likely pathogenic
## 3 NA Likely pathogenic, Pathogenic
## 4 NA Pathogenic, Likely pathogenic, Pathogenic/Likely pathogenic
## 5 NA Pathogenic, Likely pathogenic, Pathogenic/Likely pathogenic
## 6 NA Pathogenic, Likely pathogenic, Pathogenic/Likely pathogenic
## dbSNP somatic
## 1 rs121909224 Somatic
## 2 rs121909229 Somatic
## 3 rs121909224 Somatic
## 4 rs121909229 Somatic
## 5 rs121909229 Somatic
## 6 rs121909229 Somatic
nrow(pten_soma) #2610
## [1] 2610
table(pten_soma$Annotation)
##
## OncoKB: Inconclusive, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no
## 2
## OncoKB: Inconclusive, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 6
## OncoKB: Likely Neutral, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no
## 2
## OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 161
## OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no
## 309
## OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: yes
## 83
## OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: no
## 358
## OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 274
## OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: no;3DHotspot: no
## 2
## OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: no;3DHotspot: yes
## 14
## OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: no
## 16
## OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 11
## OncoKB: Likely Oncogenic, level_4;CIViC: Oncogenic: 1, Diagnostic: 2, Predisposing: 2;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 48
## OncoKB: Oncogenic, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 1
## OncoKB: Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: yes
## 13
## OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no
## 89
## OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: yes
## 53
## OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: no
## 43
## OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 203
## OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 103
## OncoKB: Predicted Oncogenic, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: no
## 1
## OncoKB: Predicted Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: no
## 36
## OncoKB: Predicted Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 98
## OncoKB: Unknown, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no
## 588
## OncoKB: Unknown, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: yes
## 92
## OncoKB: Unknown, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 2
## OncoKB: Unknown, level NA;CIViC: NA;MyCancerGenome: present;CancerHotspot: no;3DHotspot: no
## 2
pten_soma_hotspot <- pten_soma[grepl("CancerHotspot: yes", pten_soma$Annotation), ]
pten_soma_hotspot_unique <- pten_soma_hotspot %>% distinct(Protein.Change, .keep_all = TRUE)
nrow(pten_soma_hotspot_unique) #144
## [1] 144
nrow(pten_df) #4873
## [1] 4873
pten_df_clean <- pten_df %>% distinct(mutant, .keep_all = TRUE)
nrow(pten_df_clean) #4839
## [1] 4839
length(intersect(pten_soma_hotspot$Protein.Change, pten_df_clean$mutant)) #93
## [1] 93
length(setdiff(pten_soma_hotspot$Protein.Change, pten_df_clean$mutant)) #51
## [1] 51
pten_df_clean$somatic <- ifelse(
pten_df_clean$mutant %in% pten_soma_hotspot$Protein.Change,
"Somatic",
NA
)
table(pten_df_clean$somatic)
##
## Somatic
## 93
pten_df <- merge(pten_df, pten_soma_hotspot, by.x="mutant", by.y = "Protein.Change", all.x = TRUE)
nrow(pten_df)
## [1] 6674
pten_df <- pten_df %>% distinct(mutant, .keep_all = TRUE)
table(pten_df$somatic)
##
## Somatic
## 93
pten_gnomad <- pten_df %>% filter(!is.na(HGVS.Consequence)) %>%
filter(!clinvar_clinical_significance %in% c("likely_pathogenic", "pathogenic", "conflict", "likely_risk_allele", "uncertain_risk_allele", "VUS")) %>%
filter(!ClinVar.Germline.Classification %in% c("Pathogenic", "Pathogenic/Likely pathogenic", "Likely pathogenic",
"Conflicting classifications of pathogenicity", "Likely pathogenic/Likely risk allele",
"Likely risk allele", "Pathogenic/Likely pathogenic/Likely risk allele",
"Uncertain significance", "Uncertain significance/Uncertain risk allele")) %>%
filter(is.na(Phenotype_Class)) %>%
filter(is.na(somatic))
nrow(pten_gnomad) #37
## [1] 37
length(unique(pten_gnomad$mutant)) #37
## [1] 37
pten_gnomad$var_source <- "gnomAD"
pten_somatic <- merge(pten_df, pten_soma_hotspot, by.x="mutant", by.y = "Protein.Change")
nrow(pten_somatic) #
## [1] 788
length(unique(pten_somatic$mutant)) #93
## [1] 93
pten_somatic <- pten_somatic %>% distinct(mutant, .keep_all = TRUE)
head(pten_somatic)
## mutant DMS_score_abundance DMS_score_bin_abundance ESM.1v
## 1 A126G 1.1031274 1 -10.59961
## 2 A126S 1.1589273 1 -13.37786
## 3 A126T 0.9698197 1 -14.54880
## 4 C105G 0.3516079 0 -13.94637
## 5 C105R 0.1588046 0 -13.99142
## 6 C105S 0.2445279 0 -10.05558
## DMS_score_activity DMS_score_bin_activity mutation_position V1 Model
## 1 -0.82914446 1 126 2505 ThermoMPNN
## 2 0.06133791 1 126 2515 ThermoMPNN
## 3 -1.93558841 0 126 2516 ThermoMPNN
## 4 -4.83784503 0 105 2085 ThermoMPNN
## 5 -3.78634247 0 105 2094 ThermoMPNN
## 6 -1.05959009 1 105 2095 ThermoMPNN
## Dataset ddG_pred position wildtype mutation
## 1 AF-P60484-F1-model_v2 0.6510435 125 A G
## 2 AF-P60484-F1-model_v2 0.2785859 125 A S
## 3 AF-P60484-F1-model_v2 0.4760755 125 A T
## 4 AF-P60484-F1-model_v2 1.0596524 104 C G
## 5 AF-P60484-F1-model_v2 1.5254858 104 C R
## 6 AF-P60484-F1-model_v2 0.4113229 104 C S
## pdb chain new_position uniprot exposure_SS exposure_rASA
## 1 AF-P60484-F1-model_v2 A 126 P60484 S 0.33
## 2 AF-P60484-F1-model_v2 A 126 P60484 S 0.33
## 3 AF-P60484-F1-model_v2 A 126 P60484 S 0.33
## 4 AF-P60484-F1-model_v2 A 105 P60484 H 0.00
## 5 AF-P60484-F1-model_v2 A 105 P60484 H 0.00
## 6 AF-P60484-F1-model_v2 A 105 P60484 H 0.00
## spot_disorder func_esms_residue_class func_esms_variant_class
## 1 0 1 1
## 2 0 1 1
## 3 0 1 1
## 4 0 2 2
## 5 0 2 2
## 6 0 2 1
## clinvar_clinical_significance
## 1 VUS
## 2 VUS
## 3 likely_pathogenic
## 4 likely_pathogenic
## 5
## 6
## sequence
## 1 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 2 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 3 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 5 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 6 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## Phenotype_Class Phenotype_List Fitness_Score Abundance_Score HGVS.Consequence
## 1 <NA> <NA> NA NA <NA>
## 2 <NA> <NA> NA NA <NA>
## 3 <NA> <NA> NA NA <NA>
## 4 <NA> <NA> NA NA p.Cys105Gly
## 5 <NA> <NA> NA NA <NA>
## 6 <NA> <NA> NA NA <NA>
## ClinVar.Germline.Classification rsIDs ClinVar.Variation.ID gnomad
## 1 <NA> <NA> NA <NA>
## 2 <NA> <NA> NA <NA>
## 3 <NA> <NA> NA <NA>
## 4 Likely pathogenic rs1303165645 1728081 gnomad
## 5 <NA> <NA> NA <NA>
## 6 <NA> <NA> NA <NA>
## fitted residuals
## 1 -0.1823781 -0.6467664
## 2 -0.1908700 0.2522079
## 3 -0.1785532 -1.7570352
## 4 -1.3357357 -3.5021093
## 5 -2.3737286 -1.4126139
## 6 -1.9276639 0.8680738
## Study.x
## 1 Breast Invasive Carcinoma (TCGA, PanCancer Atlas)
## 2 Breast Cancer (MSK, Cancer Cell 2018)
## 3 Uterine Corpus Endometrial Carcinoma (TCGA, PanCancer Atlas)
## 4 Melanomas (TCGA, Cell 2015)
## 5 Breast Cancer (METABRIC, Nature 2012 & Nat Commun 2016)
## 6 MSK-IMPACT Clinical Sequencing Cohort (MSKCC, Nat Med 2017)
## Sample.ID.x Cancer.Type.x
## 1 TCGA-LL-A5YP-01
## 2 P-0000422-T02-IM3
## 3 TCGA-EY-A1GU-01
## 4 TCGA-D3-A2J8-06
## 5 MB-6143
## 6 P-0002672-T01-IM3 Gastrointestinal Stromal Tumor
## Annotation.x
## 1 OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 2 OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 3 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 4 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 5 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 6 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## Functional.Impact.x
## 1 MutationAssessor: impact: low, score: 1.885;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## 2 MutationAssessor: impact: medium, score: 3.39;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## 3 MutationAssessor: impact: medium, score: 2.9;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 4 MutationAssessor: impact: medium, score: 3.465;SIFT: impact: deleterious, score: 0.01;Polyphen-2: impact: probably_damaging, score: 0.998
## 5 MutationAssessor: impact: high, score: 3.81;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 6 MutationAssessor: impact: medium, score: 2.53;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## Mutation.Type.x Variant.Type.x Copy...x COSMIC.x MS.x VS.x
## 1 Missense_Mutation SNP 18 . .
## 2 Missense_Mutation SNP 18 UNKNOWN Unknown
## 3 Missense_Mutation SNP 18 . .
## 4 Missense_Mutation SNP 21 Somatic <NA>
## 5 Missense_Mutation SNP 21 <NA> <NA>
## 6 Missense_Mutation SNP diploid 21 <NA> <NA>
## Center.x Chromosome.x Start.Pos.x End.Pos.x Ref.x Var.x HGVSg.x
## 1 . 10 89692893 89692893 C G 10:g.89692893C>G
## 2 MSKCC 10 89692892 89692892 G T 10:g.89692892G>T
## 3 . 10 89692892 89692892 G A 10:g.89692892G>A
## 4 broad.mit.edu 10 89692829 89692829 T G 10:g.89692829T>G
## 5 METABRIC 10 89692829 89692829 T C 10:g.89692829T>C
## 6 <NA> 10 89692829 89692829 T A 10:g.89692829T>A
## HGVSc.x Allele.Freq..T..x Allele.Freq..N..x
## 1 ENST00000371953.3:c.377C>G 0.50 NA
## 2 ENST00000371953.3:c.376G>T 0.48 0.0036
## 3 ENST00000371953.3:c.376G>A 0.46 NA
## 4 ENST00000371953.3:c.313T>G 0.16 NA
## 5 ENST00000371953.3:c.313T>C NA NA
## 6 ENST00000371953.3:c.313T>A 0.32 NA
## Variant.Reads.x Ref.Reads.x Variant.Reads..Normal..x Ref.Reads..Normal..x
## 1 27 27 NA 82
## 2 105 115 1 275
## 3 43 51 NA 68
## 4 11 56 NA NA
## 5 NA NA NA NA
## 6 44 93 NA NA
## X..Mut.in.Sample.x Exon.x gnomAD.x ClinVar.x
## 1 64 5/9 NA
## 2 7 5/9 NA Uncertain significance
## 3 1179 5/9 NA Uncertain significance, Likely pathogenic
## 4 507 5/9 NA
## 5 6 5/9 NA
## 6 16 5/9 NA
## dbSNP.x somatic.x
## 1 Somatic
## 2 rs1554898129 Somatic
## 3 rs1554898129 Somatic
## 4 Somatic
## 5 Somatic
## 6 Somatic
## Study.y
## 1 Breast Invasive Carcinoma (TCGA, PanCancer Atlas)
## 2 Breast Cancer (MSK, Cancer Cell 2018)
## 3 Uterine Corpus Endometrial Carcinoma (TCGA, PanCancer Atlas)
## 4 Melanomas (TCGA, Cell 2015)
## 5 Breast Cancer (METABRIC, Nature 2012 & Nat Commun 2016)
## 6 MSK-IMPACT Clinical Sequencing Cohort (MSKCC, Nat Med 2017)
## Sample.ID.y Cancer.Type.y
## 1 TCGA-LL-A5YP-01
## 2 P-0000422-T02-IM3
## 3 TCGA-EY-A1GU-01
## 4 TCGA-D3-A2J8-06
## 5 MB-6143
## 6 P-0002672-T01-IM3 Gastrointestinal Stromal Tumor
## Annotation.y
## 1 OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 2 OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 3 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 4 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 5 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 6 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## Functional.Impact.y
## 1 MutationAssessor: impact: low, score: 1.885;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## 2 MutationAssessor: impact: medium, score: 3.39;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## 3 MutationAssessor: impact: medium, score: 2.9;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 4 MutationAssessor: impact: medium, score: 3.465;SIFT: impact: deleterious, score: 0.01;Polyphen-2: impact: probably_damaging, score: 0.998
## 5 MutationAssessor: impact: high, score: 3.81;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 6 MutationAssessor: impact: medium, score: 2.53;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## Mutation.Type.y Variant.Type.y Copy...y COSMIC.y MS.y VS.y
## 1 Missense_Mutation SNP 18 . .
## 2 Missense_Mutation SNP 18 UNKNOWN Unknown
## 3 Missense_Mutation SNP 18 . .
## 4 Missense_Mutation SNP 21 Somatic <NA>
## 5 Missense_Mutation SNP 21 <NA> <NA>
## 6 Missense_Mutation SNP diploid 21 <NA> <NA>
## Center.y Chromosome.y Start.Pos.y End.Pos.y Ref.y Var.y HGVSg.y
## 1 . 10 89692893 89692893 C G 10:g.89692893C>G
## 2 MSKCC 10 89692892 89692892 G T 10:g.89692892G>T
## 3 . 10 89692892 89692892 G A 10:g.89692892G>A
## 4 broad.mit.edu 10 89692829 89692829 T G 10:g.89692829T>G
## 5 METABRIC 10 89692829 89692829 T C 10:g.89692829T>C
## 6 <NA> 10 89692829 89692829 T A 10:g.89692829T>A
## HGVSc.y Allele.Freq..T..y Allele.Freq..N..y
## 1 ENST00000371953.3:c.377C>G 0.50 NA
## 2 ENST00000371953.3:c.376G>T 0.48 0.0036
## 3 ENST00000371953.3:c.376G>A 0.46 NA
## 4 ENST00000371953.3:c.313T>G 0.16 NA
## 5 ENST00000371953.3:c.313T>C NA NA
## 6 ENST00000371953.3:c.313T>A 0.32 NA
## Variant.Reads.y Ref.Reads.y Variant.Reads..Normal..y Ref.Reads..Normal..y
## 1 27 27 NA 82
## 2 105 115 1 275
## 3 43 51 NA 68
## 4 11 56 NA NA
## 5 NA NA NA NA
## 6 44 93 NA NA
## X..Mut.in.Sample.y Exon.y gnomAD.y ClinVar.y
## 1 64 5/9 NA
## 2 7 5/9 NA Uncertain significance
## 3 1179 5/9 NA Uncertain significance, Likely pathogenic
## 4 507 5/9 NA
## 5 6 5/9 NA
## 6 16 5/9 NA
## dbSNP.y somatic.y
## 1 Somatic
## 2 rs1554898129 Somatic
## 3 rs1554898129 Somatic
## 4 Somatic
## 5 Somatic
## 6 Somatic
pten_somatic <- pten_somatic %>%
filter(!clinvar_clinical_significance %in% c("likely_pathogenic", "pathogenic", "conflict", "likely_risk_allele", "uncertain_risk_allele")) %>%
filter(!ClinVar.Germline.Classification %in% c("Pathogenic", "Pathogenic/Likely pathogenic", "Likely pathogenic",
"Conflicting classifications of pathogenicity", "Likely pathogenic/Likely risk allele",
"Likely risk allele", "Pathogenic/Likely pathogenic/Likely risk allele",
"Uncertain significance", "Uncertain significance/Uncertain risk allele")) %>%
filter(is.na(Phenotype_Class))
nrow(pten_somatic) #37
## [1] 37
pten_somatic$var_source <- "Somatic"
colnames(pten_somatic)
## [1] "mutant" "DMS_score_abundance"
## [3] "DMS_score_bin_abundance" "ESM.1v"
## [5] "DMS_score_activity" "DMS_score_bin_activity"
## [7] "mutation_position" "V1"
## [9] "Model" "Dataset"
## [11] "ddG_pred" "position"
## [13] "wildtype" "mutation"
## [15] "pdb" "chain"
## [17] "new_position" "uniprot"
## [19] "exposure_SS" "exposure_rASA"
## [21] "spot_disorder" "func_esms_residue_class"
## [23] "func_esms_variant_class" "clinvar_clinical_significance"
## [25] "sequence" "Phenotype_Class"
## [27] "Phenotype_List" "Fitness_Score"
## [29] "Abundance_Score" "HGVS.Consequence"
## [31] "ClinVar.Germline.Classification" "rsIDs"
## [33] "ClinVar.Variation.ID" "gnomad"
## [35] "fitted" "residuals"
## [37] "Study.x" "Sample.ID.x"
## [39] "Cancer.Type.x" "Annotation.x"
## [41] "Functional.Impact.x" "Mutation.Type.x"
## [43] "Variant.Type.x" "Copy...x"
## [45] "COSMIC.x" "MS.x"
## [47] "VS.x" "Center.x"
## [49] "Chromosome.x" "Start.Pos.x"
## [51] "End.Pos.x" "Ref.x"
## [53] "Var.x" "HGVSg.x"
## [55] "HGVSc.x" "Allele.Freq..T..x"
## [57] "Allele.Freq..N..x" "Variant.Reads.x"
## [59] "Ref.Reads.x" "Variant.Reads..Normal..x"
## [61] "Ref.Reads..Normal..x" "X..Mut.in.Sample.x"
## [63] "Exon.x" "gnomAD.x"
## [65] "ClinVar.x" "dbSNP.x"
## [67] "somatic.x" "Study.y"
## [69] "Sample.ID.y" "Cancer.Type.y"
## [71] "Annotation.y" "Functional.Impact.y"
## [73] "Mutation.Type.y" "Variant.Type.y"
## [75] "Copy...y" "COSMIC.y"
## [77] "MS.y" "VS.y"
## [79] "Center.y" "Chromosome.y"
## [81] "Start.Pos.y" "End.Pos.y"
## [83] "Ref.y" "Var.y"
## [85] "HGVSg.y" "HGVSc.y"
## [87] "Allele.Freq..T..y" "Allele.Freq..N..y"
## [89] "Variant.Reads.y" "Ref.Reads.y"
## [91] "Variant.Reads..Normal..y" "Ref.Reads..Normal..y"
## [93] "X..Mut.in.Sample.y" "Exon.y"
## [95] "gnomAD.y" "ClinVar.y"
## [97] "dbSNP.y" "somatic.y"
## [99] "var_source"
pten_patho <- pten_df %>% filter(clinvar_clinical_significance %in% c("likely_pathogenic", "pathogenic"))
nrow(pten_patho) #
## [1] 115
length(unique(pten_patho$mutant)) #115
## [1] 115
pten_patho <- pten_patho %>% distinct(mutant, .keep_all = TRUE)
nrow(pten_patho) #115
## [1] 115
pten_patho$var_source <- "Pathogenic"
p1 <- ggplot(pten_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
inherit.aes = FALSE, color = "black", linewidth = 1) +
geom_text_repel(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
aes(label = mutant),
size = 4,color = "black",
max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
geom_point(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
aes(x = DMS_score_abundance, y = DMS_score_activity),
shape = 17, color = "black", size = 3) + # Triangle shape
labs(
title = "All 4839 Variants",
x = "Measured abundance",
y = "Measured activity",
color = "Residuals"
) +
theme_classic() +
ylim(-5.8, 2.9) + xlim(-0.3, 1.7) +
scale_color_viridis(option = "C", direction = 1, limits = c(-4.8, 5.2)) +
theme(legend.position = "none")
# Add marginal density plots
p1 <- ggMarginal(
p1,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
nrow(pten_gnomad)
## [1] 37
p2 <- ggplot(pten_gnomad, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
geom_point(size = 2) +
geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
inherit.aes = FALSE, color = "black", linewidth = 1) +
labs(
title = "gnomAD Variants",
x = "Abundance",
y = "Activity",
color = "Loess residuals"
) +
theme_classic() +
ylim(-5.8, 2.9) + xlim(-0.3, 1.7) +
scale_color_viridis(option = "C", direction = 1, limits = c(-4.8, 5.2)) +
theme(legend.position = "none") +
geom_text_repel(data = subset(pten_df, mutant %in% c("A79T")),
aes(label = mutant),
size = 4,color = "black",
max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
geom_point(data = subset(pten_df, mutant %in% c("A79T")),
aes(x = DMS_score_abundance, y = DMS_score_activity),
color = "black", size = 2)
# Add marginal density plots
p2 <- ggMarginal(
p2,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
nrow(pten_patho)
## [1] 115
p3 <- ggplot(pten_patho, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
geom_point(size = 2, shape = 17) +
geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
inherit.aes = FALSE, color = "black", linewidth = 1) +
geom_text_repel(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
aes(label = mutant),
size = 4,color = "black",
max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
geom_point(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
aes(x = DMS_score_abundance, y = DMS_score_activity),
shape = 17, color = "black", size = 3) + # Triangle shape
labs(
title = "ClinVar Pathogenic Variants",
x = "Abundance",
y = "Activity",
color = "Loess residuals"
) +
theme_classic() +
ylim(-5.8, 2.9) + xlim(-0.3, 1.7) +
scale_color_viridis(option = "C", direction = 1, limits = c(-4.8, 5.2)) +
theme(legend.position = "none")
# Add marginal density plots
p3 <- ggMarginal(
p3,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
p4 <- plot_grid(p1,p2,p3,ncol=3, nrow=1)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_scatter1.pdf",
plot = p4, width = 9, height = 3, dpi = 300)
p4

ggplot(pten_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
inherit.aes = FALSE, color = "black", linewidth = 1) +
geom_text_repel(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
aes(label = mutant),
size = 4,color = "black",
max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
geom_point(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
aes(x = DMS_score_abundance, y = DMS_score_activity),
shape = 17, color = "black", size = 3) + # Triangle shape
labs(
title = "All 4839 Variants",
x = "Measured abundance",
y = "Measured activity",
color = "Residuals"
) +
theme_classic() +
ylim(-5.8, 2.9) + xlim(-0.3, 1.7) +
scale_color_viridis(option = "C", direction = 1, limits = c(-4.8, 5.2)) +
theme(legend.position = "bottom")

# 1. Set up
set.seed(123)
n_boot <- 1000
match_window <- 0.05
n_patho <- nrow(pten_patho)
# 2. Get abundance/residuals from pten_patho
patho_df <- pten_patho %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"
# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- pten_df %>%
filter(!(mutant %in% pten_patho$mutant)) %>%
filter(!(mutant %in% pten_somatic$mutant)) %>% dplyr::select(DMS_score_abundance, residuals)
bootstrap_medians <- vector("numeric", length = n_boot)
# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)
# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)
for (i in 1:n_boot) {
for (j in 1:n_patho) {
bin_j <- patho_df$bin[j]
candidates <- bin_lookup[[as.character(bin_j)]]
if (!is.null(candidates) && length(candidates) > 0) {
bootstrap_matrix[i, j] <- sample(candidates, 1)
}
}
}
# Summarize into a dataframe
boot_df <- data.frame(
group = "Random abundance-matched",
residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE)
)
# Combine with patho residuals
plot_df <- bind_rows(
patho_df %>% dplyr::select(group, residuals),
boot_df
)
label_df <- plot_df %>%
group_by(group) %>%
summarise(
n = n(),
median_val = median(residuals),
y_max = max(residuals),
.groups = "drop"
)
label_df <- label_df %>%
mutate(n_label = case_when(
group == "Random abundance-matched" ~ "bootstrapped 1000 times",
TRUE ~ paste0("n = ", n)
))
# Plot
p_fast <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = label_df,
aes(x = group, y = 4.5, label = n_label),
inherit.aes = FALSE,
size = 4) +
geom_text(
data = label_df,
aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
inherit.aes = FALSE,
size = 6
) +
labs(
x = "",
y = "Activity-abundance residuals",
title = ""
) +
theme_classic(base_size = 14) +
scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
theme(legend.position = "none") +
geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
map_signif_level = FALSE,
test = "wilcox.test",
tip_length = 0.01)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin1.pdf",
plot = p_fast, width = 3, height = 4, dpi = 300)
p_fast

patho_df <- pten_df_clean %>% distinct(mutant, Phenotype_Class, .keep_all = TRUE)
nrow(patho_df) #4839
## [1] 4839
patho_df1 <- patho_df %>% filter(Phenotype_Class == "ASD/DD")
patho_df2 <- patho_df %>% filter(Phenotype_Class == "ASD/DD & PHTS")
patho_df3 <- patho_df %>% filter(Phenotype_Class == "PHTS")
patho_df1$var_class2 <- "ASD/DD"
patho_df2$var_class2 <- "ASD/DD & PHTS"
patho_df3$var_class2 <- "PHTS"
pten_gnomad$var_class2 <- "gnomAD"
pten_somatic$var_class2 <- "somatic"
colnames(patho_df1)
## [1] "mutant" "DMS_score_abundance"
## [3] "DMS_score_bin_abundance" "ESM.1v"
## [5] "DMS_score_activity" "DMS_score_bin_activity"
## [7] "mutation_position" "V1"
## [9] "Model" "Dataset"
## [11] "ddG_pred" "position"
## [13] "wildtype" "mutation"
## [15] "pdb" "chain"
## [17] "new_position" "uniprot"
## [19] "exposure_SS" "exposure_rASA"
## [21] "spot_disorder" "func_esms_residue_class"
## [23] "func_esms_variant_class" "clinvar_clinical_significance"
## [25] "sequence" "Phenotype_Class"
## [27] "Phenotype_List" "Fitness_Score"
## [29] "Abundance_Score" "HGVS.Consequence"
## [31] "ClinVar.Germline.Classification" "rsIDs"
## [33] "ClinVar.Variation.ID" "gnomad"
## [35] "fitted" "residuals"
## [37] "somatic" "var_class2"
patho_df1 <- patho_df1 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2, residuals)
patho_df2 <- patho_df2 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals)
patho_df3 <- patho_df3 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals)
pten_gnomad <- pten_gnomad %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals)
pten_somatic <- pten_somatic %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals)
combined_df <- rbind(patho_df1,patho_df2,patho_df3,pten_gnomad, pten_somatic)
median_df <- combined_df %>%
group_by(var_class2) %>%
summarise(
median_residual = median(residuals),
n = n()
)
median_df
## # A tibble: 5 × 3
## var_class2 median_residual n
## <chr> <dbl> <int>
## 1 ASD/DD 0.162 12
## 2 ASD/DD & PHTS -1.48 10
## 3 PHTS -1.77 36
## 4 gnomAD -0.00905 37
## 5 somatic -1.52 37
#wilcox.test(patho_df1$residuals, pten_gnomad$residuals)
combined_df$var_class2 <- factor(
combined_df$var_class2,
levels = c("gnomAD", "somatic","ASD/DD", "ASD/DD & PHTS", "PHTS")
)
custom_colors <- c(
"gnomAD" = "#1b9e77", # Teal green
"somatic" = "#fbb4ae",
"ASD/DD" = "#f4a6b3", # Warm orange
"ASD/DD & PHTS" = "#e7298a", # Muted purple
"PHTS" = "#7570b3" # Hot pink / magenta
)
highlight_muts <- c("C124S", "G129E", "R130G", "R130Q", "P38S", "R130P", "D92H")
p10 <- ggplot(combined_df, aes(x = var_class2, y = residuals, fill = var_class2)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_class2, y = median_residual + 0.5, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_class2, y = 7.5, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "Across groups",
x = "",
y = "Activity-abundance residual",
fill = ""
) +
theme_classic(base_size = 14) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 20, hjust = 1)
) +
geom_signif(
comparisons = list(c("gnomAD", "somatic"),
c("gnomAD", "ASD/DD"),
c("gnomAD", "ASD/DD & PHTS"),
c("gnomAD", "PHTS")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin2.pdf",
plot = p10, width = 3, height = 4, dpi = 300)
p10

wilcox.test(patho_df1$residuals, pten_gnomad$residuals) #0.688
##
## Wilcoxon rank sum exact test
##
## data: patho_df1$residuals and pten_gnomad$residuals
## W = 235, p-value = 0.7743
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df2$residuals, pten_gnomad$residuals) #0.03351
##
## Wilcoxon rank sum exact test
##
## data: patho_df2$residuals and pten_gnomad$residuals
## W = 82, p-value = 0.006283
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df3$residuals, pten_gnomad$residuals) #9.383e-05
##
## Wilcoxon rank sum exact test
##
## data: patho_df3$residuals and pten_gnomad$residuals
## W = 299, p-value = 3.013e-05
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df1$residuals, patho_df2$residuals) #0.1693
##
## Wilcoxon rank sum exact test
##
## data: patho_df1$residuals and patho_df2$residuals
## W = 93, p-value = 0.02996
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df1$residuals, patho_df3$residuals) #0.02058
##
## Wilcoxon rank sum exact test
##
## data: patho_df1$residuals and patho_df3$residuals
## W = 336, p-value = 0.003475
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df2$residuals, patho_df3$residuals) #0.3959
##
## Wilcoxon rank sum exact test
##
## data: patho_df2$residuals and patho_df3$residuals
## W = 202, p-value = 0.5725
## alternative hypothesis: true location shift is not equal to 0
nrow(pten_gnomad) #37
## [1] 37
nrow(pten_patho) #115
## [1] 115
nrow(pten_somatic) #37
## [1] 37
pten_gnomad$var_source <- "gnomAD"
pten_patho$var_source <- "Pathogenic"
pten_somatic$var_source <- "Somatic"
pten_gnomad <- pten_gnomad %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)
pten_patho <- pten_patho %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)
pten_somatic <- pten_somatic %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)
combined_df <- rbind(pten_gnomad, pten_patho, pten_somatic)
nrow(combined_df)
## [1] 189
table(combined_df$var_source)
##
## gnomAD Pathogenic Somatic
## 37 115 37
pten_df_clean_2plot <- combined_df
median_df <- pten_df_clean_2plot %>%
group_by(var_source) %>%
summarise(
median_residual = median(residuals, na.rm = TRUE),
n = n()
)
median_df
## # A tibble: 3 × 3
## var_source median_residual n
## <chr> <dbl> <int>
## 1 Pathogenic -1.48 115
## 2 Somatic -1.52 37
## 3 gnomAD -0.00905 37
pten_df_clean_2plot$var_source <- factor(
pten_df_clean_2plot$var_source,
levels = c("gnomAD", "Somatic", "Pathogenic")
)
table(pten_df_clean_2plot$var_source)
##
## gnomAD Somatic Pathogenic
## 37 37 115
custom_colors <- c(
"gnomAD" = "#1b9e77", # Teal green (unchanged)
"Somatic" = "#fbb4ae", # Soft sky blue
"Pathogenic" = "#de425b" # Deep ocean blue
)
p11 <- ggplot(pten_df_clean_2plot, aes(x = var_source, y = residuals, fill = var_source)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "All 173 Variants",
x = "",
y = "Activity-abundance residuals",
fill = ""
) + theme_classic()+
theme(
legend.position = "none"
) +
geom_signif(
comparisons = list(c("gnomAD", "Somatic"),
c("gnomAD", "Pathogenic"),
c("Somatic", "Pathogenic")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
)
p11

pten_df_clean_2plot <- pten_df_clean_2plot %>%
mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
pten_df_clean_2plot_int <- pten_df_clean_2plot %>% filter(mutation_position %in% all_positions)
nrow(pten_df_clean_2plot_int) #89
## [1] 89
length(unique(pten_df_clean_2plot_int$mutant)) #89
## [1] 89
median_df <- pten_df_clean_2plot_int %>%
group_by(var_source) %>%
summarise(
median_residual = median(residuals, na.rm = TRUE),
n = n()
)
median_df
## # A tibble: 3 × 3
## var_source median_residual n
## <fct> <dbl> <int>
## 1 gnomAD -0.0190 14
## 2 Somatic -2.38 15
## 3 Pathogenic -1.77 60
table(pten_df_clean_2plot_int$var_source)
##
## gnomAD Somatic Pathogenic
## 14 15 60
highlight_muts <- c("C124S", "G129E", "R130G", "R130Q", "P38S", "R130P", "D92H")
nrow(pten_df_clean_2plot_int)
## [1] 89
p12 <- ggplot(pten_df_clean_2plot_int, aes(x = var_source, y = residuals, fill = var_source)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.9, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 1, fill = "black", color = "black", stroke = 0.7) +
geom_point(
data = subset(pten_df_clean_2plot_int, mutant %in% highlight_muts),
aes(x = var_source, y = residuals),
shape = 17,
size = 2,
fill = "black",
color = "darkcyan",
stroke = 1
) +
geom_text_repel(data = subset(pten_df_clean_2plot_int, mutant %in% highlight_muts),
aes(label=mutant),color = "darkcyan", size = 3)+
geom_text(
data = median_df,
aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 4
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "89 Orthosteric Variants",
x = "",
y = "Activity-abundance residuals",
fill = ""
) +
theme_classic() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
geom_signif(
comparisons = list(c("gnomAD", "Somatic"),
c("gnomAD", "Pathogenic"),
c("Somatic", "Pathogenic")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
) + ylim(-8, 11)
p12

pten_df_clean_2plot_out <- pten_df_clean_2plot %>% filter(!mutation_position %in% all_positions)
nrow(pten_df_clean_2plot_out) #100
## [1] 100
median_df <- pten_df_clean_2plot_out %>%
group_by(var_source) %>%
summarise(
median_residual = median(residuals, na.rm = TRUE),
n = n()
)
median_df
## # A tibble: 3 × 3
## var_source median_residual n
## <fct> <dbl> <int>
## 1 gnomAD 0.0154 23
## 2 Somatic -1.33 22
## 3 Pathogenic -1.01 55
p13 <- ggplot(pten_df_clean_2plot_out, aes(x = var_source, y = residuals, fill = var_source)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.9, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 1, fill = "black", color = "black", stroke = 0.7) +
geom_point(
data = subset(pten_df_clean_2plot_out, mutant %in% highlight_muts),
aes(x = var_source, y = residuals),
shape = 17,
size = 2,
fill = "black",
color = "darkcyan",
stroke = 1
) +
geom_text_repel(data = subset(pten_df_clean_2plot_out, mutant %in% highlight_muts),
aes(label=mutant),color = "darkcyan",size = 3)+
geom_text(
data = median_df,
aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 4
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "100 Non-orthosteric Variants",
x = "",
y = "Activity-abundance residuals",
fill = ""
) +
theme_classic() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
geom_signif(
comparisons = list(c("gnomAD", "Somatic"),
c("gnomAD", "Pathogenic"),
c("Somatic", "Pathogenic")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
) + ylim(-8, 11)
p13

pten_df_clean_2plot_stable <- pten_df_clean_2plot %>% filter(DMS_score_abundance > 0.71)
nrow(pten_df_clean_2plot_stable) #81
## [1] 81
table(pten_df_clean_2plot_stable$var_source)
##
## gnomAD Somatic Pathogenic
## 28 14 39
pten_df_clean_2plot_stable_int <- pten_df_clean_2plot_stable %>% filter(mutation_position %in% all_positions)
nrow(pten_df_clean_2plot_stable_int) #48
## [1] 48
median_df <- pten_df_clean_2plot_stable_int %>%
group_by(var_source) %>%
summarise(
median_residual = median(residuals, na.rm = TRUE),
n = n()
)
median_df
## # A tibble: 3 × 3
## var_source median_residual n
## <fct> <dbl> <int>
## 1 gnomAD -0.222 9
## 2 Somatic -2.12 8
## 3 Pathogenic -2.41 31
p14 <- ggplot(pten_df_clean_2plot_stable_int, aes(x = var_source, y = residuals, fill = var_source)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.9, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 1, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4
) +
labs(
title = "48 WT-abundance Orthosteric Variants",
x = "",
y = "LOESS fit residuals",
fill = ""
) +
theme_classic() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
geom_signif(
comparisons = list(c("gnomAD", "Somatic"),
c("gnomAD", "Pathogenic"),
c("Somatic", "Pathogenic")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
) + ylim(-8, 11)
p14
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_violin()`).

pten_df_clean_2plot_stable_out <- pten_df_clean_2plot_stable %>% filter(!mutation_position %in% all_positions)
nrow(pten_df_clean_2plot_stable_out) #33
## [1] 33
median_df <- pten_df_clean_2plot_stable_out %>%
group_by(var_source) %>%
summarise(
median_residual = median(residuals, na.rm = TRUE),
n = n()
)
median_df
## # A tibble: 3 × 3
## var_source median_residual n
## <fct> <dbl> <int>
## 1 gnomAD -0.0127 19
## 2 Somatic -1.34 6
## 3 Pathogenic -1.56 8
p15 <- ggplot(pten_df_clean_2plot_stable_out, aes(x = var_source, y = residuals, fill = var_source)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.9, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 1, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 4
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "33 WT-abundance Non-orthosteric Variants",
x = "",
y = "LOESS fit residuals",
fill = ""
) +
theme_classic() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
geom_signif(
comparisons = list(c("gnomAD", "Somatic"),
c("gnomAD", "Pathogenic"),
c("Somatic", "Pathogenic")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
) + ylim(-8, 11)
p15

range(pten_df_clean_2plot$residuals)
## [1] -5.157748 2.422716
wilcox.test(pten_df_clean_2plot_stable_int %>% filter(var_source == "gnomAD") %>% pull(residuals),
pten_df_clean_2plot_stable_int %>% filter(var_source == "Pathogenic") %>% pull(residuals))
##
## Wilcoxon rank sum exact test
##
## data: pten_df_clean_2plot_stable_int %>% filter(var_source == "gnomAD") %>% pull(residuals) and pten_df_clean_2plot_stable_int %>% filter(var_source == "Pathogenic") %>% pull(residuals)
## W = 224, p-value = 0.004996
## alternative hypothesis: true location shift is not equal to 0
p16 <- plot_grid(p12,p13,ncol=2,nrow=1)
p16

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin3.pdf",
plot = p16, width = 5, height = 3, dpi = 300)
p17 <- plot_grid(p14,p15,ncol=2,nrow=1)
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_violin()`).
p17

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin4.pdf",
plot = p17, width = 6, height = 4, dpi = 300)
pdb <- read.pdb("~/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/PTEN/1d5r.pdb")
# Extract C-alpha atoms from protein (exclude ligand)
protein_ca <- pdb$atom[pdb$atom$elety == "CA" & pdb$atom$resid != "TLA", ]
# Extract ligand atoms (TLA residue)
ligand_atoms <- pdb$atom[pdb$atom$resid == "TLA" & pdb$atom$type == "HETATM", ]
# Calculate min distance from each C-alpha to the ligand
protein_ca$min_dist_to_ligand <- apply(protein_ca, 1, function(atom) {
dists <- sqrt((as.numeric(atom[["x"]]) - as.numeric(ligand_atoms$x))^2 +
(as.numeric(atom[["y"]]) - as.numeric(ligand_atoms$y))^2 +
(as.numeric(atom[["z"]]) - as.numeric(ligand_atoms$z))^2)
return(min(dists))
})
# View summary
nrow(protein_ca) #307
## [1] 307
summary(protein_ca$min_dist_to_ligand)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.353 14.371 19.765 19.785 25.267 37.348
head(protein_ca$min_dist_to_ligand)
## [1] 18.59907 17.19423 14.38261 15.13317 17.71047 20.54182
length(protein_ca$min_dist_to_ligand) #307
## [1] 307
fil_protein_ca <- protein_ca %>% dplyr::select(resid, resno, min_dist_to_ligand)
merged_df <- merge(pten_df_clean, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
nrow(merged_df)
## [1] 4839
head(merged_df)
## mutation_position mutant DMS_score_abundance DMS_score_bin_abundance
## 1 2 T2A 0.7755378 1
## 2 2 T2D 1.1906667 1
## 3 2 T2K 0.9446276 1
## 4 2 T2N 1.1539191 1
## 5 2 T2P 0.8433099 1
## 6 2 T2R 0.9024487 1
## ESM.1v DMS_score_activity DMS_score_bin_activity V1 Model
## 1 -0.0355402 2.275491838 1 20 ThermoMPNN
## 2 -9.5293701 2.387178001 1 22 ThermoMPNN
## 3 -7.9349979 -0.009668751 1 28 ThermoMPNN
## 4 -7.4877127 -3.204991850 0 31 ThermoMPNN
## 5 -6.1493063 -0.257056982 1 32 ThermoMPNN
## 6 -9.1065905 0.551584254 1 34 ThermoMPNN
## Dataset ddG_pred position wildtype mutation
## 1 AF-P60484-F1-model_v2 0.06013024 1 T A
## 2 AF-P60484-F1-model_v2 -0.01170939 1 T D
## 3 AF-P60484-F1-model_v2 0.19344133 1 T K
## 4 AF-P60484-F1-model_v2 0.06487387 1 T N
## 5 AF-P60484-F1-model_v2 0.16262442 1 T P
## 6 AF-P60484-F1-model_v2 0.20207280 1 T R
## pdb chain new_position uniprot exposure_SS exposure_rASA
## 1 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 2 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 3 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 4 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 5 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 6 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## spot_disorder func_esms_residue_class func_esms_variant_class
## 1 1 1 0
## 2 1 1 1
## 3 1 1 0
## 4 1 1 0
## 5 1 1 0
## 6 1 1 1
## clinvar_clinical_significance
## 1
## 2
## 3
## 4
## 5
## 6
## sequence
## 1 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 2 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 3 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 5 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 6 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## Phenotype_Class Phenotype_List Fitness_Score Abundance_Score HGVS.Consequence
## 1 <NA> <NA> NA NA <NA>
## 2 <NA> <NA> NA NA <NA>
## 3 <NA> <NA> NA NA <NA>
## 4 <NA> <NA> NA NA <NA>
## 5 <NA> <NA> NA NA <NA>
## 6 <NA> <NA> NA NA <NA>
## ClinVar.Germline.Classification rsIDs ClinVar.Variation.ID gnomad fitted
## 1 <NA> <NA> NA <NA> -0.1669595
## 2 <NA> <NA> NA <NA> -0.1979372
## 3 <NA> <NA> NA <NA> -0.1787797
## 4 <NA> <NA> NA <NA> -0.1899017
## 5 <NA> <NA> NA <NA> -0.1742966
## 6 <NA> <NA> NA <NA> -0.1759730
## residuals somatic resid min_dist_to_ligand
## 1 2.44245136 <NA> <NA> NA
## 2 2.58511520 <NA> <NA> NA
## 3 0.16911091 <NA> <NA> NA
## 4 -3.01509015 <NA> <NA> NA
## 5 -0.08276039 <NA> <NA> NA
## 6 0.72755725 <NA> <NA> NA
merged_df <- merged_df %>% dplyr::select(-sequence)
nrow(merged_df)
## [1] 4839
merged_df <- merged_df %>% filter(residuals >= 0)
nrow(merged_df) #2449
## [1] 2390
all_positions <- c(
88:98, # WPD loop
160:171, # TI loop
123:130, # P loop
35:49, # Arginine loop
200:212, # CBR1
226:238, # CBR2
258:268, # CBR3
327:335, # Cα2 loop
151:174, # membrane-binding alpha-helix
1:15 # PBM motif
)
active_positions <- c(
88:98, # WPD loop
159:171, # TI loop
123:130, # P loop
35:49 # Arginine loop
)
binding_positions <- c(
200:212, # CBR1
226:238, # CBR2
258:268, # CBR3
327:335, # Cα2 loop
151:174, # membrane-binding alpha-helix
1:15 # PBM motif
)
merged_df <- merge(pten_df_clean, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
merged_df <- merged_df %>% dplyr::select(-sequence)
nrow(merged_df) #4839
## [1] 4839
head(merged_df)
## mutation_position mutant DMS_score_abundance DMS_score_bin_abundance
## 1 2 T2A 0.7755378 1
## 2 2 T2D 1.1906667 1
## 3 2 T2K 0.9446276 1
## 4 2 T2N 1.1539191 1
## 5 2 T2P 0.8433099 1
## 6 2 T2R 0.9024487 1
## ESM.1v DMS_score_activity DMS_score_bin_activity V1 Model
## 1 -0.0355402 2.275491838 1 20 ThermoMPNN
## 2 -9.5293701 2.387178001 1 22 ThermoMPNN
## 3 -7.9349979 -0.009668751 1 28 ThermoMPNN
## 4 -7.4877127 -3.204991850 0 31 ThermoMPNN
## 5 -6.1493063 -0.257056982 1 32 ThermoMPNN
## 6 -9.1065905 0.551584254 1 34 ThermoMPNN
## Dataset ddG_pred position wildtype mutation
## 1 AF-P60484-F1-model_v2 0.06013024 1 T A
## 2 AF-P60484-F1-model_v2 -0.01170939 1 T D
## 3 AF-P60484-F1-model_v2 0.19344133 1 T K
## 4 AF-P60484-F1-model_v2 0.06487387 1 T N
## 5 AF-P60484-F1-model_v2 0.16262442 1 T P
## 6 AF-P60484-F1-model_v2 0.20207280 1 T R
## pdb chain new_position uniprot exposure_SS exposure_rASA
## 1 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 2 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 3 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 4 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 5 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## 6 AF-P60484-F1-model_v2 A 2 P60484 H 0.57
## spot_disorder func_esms_residue_class func_esms_variant_class
## 1 1 1 0
## 2 1 1 1
## 3 1 1 0
## 4 1 1 0
## 5 1 1 0
## 6 1 1 1
## clinvar_clinical_significance Phenotype_Class Phenotype_List Fitness_Score
## 1 <NA> <NA> NA
## 2 <NA> <NA> NA
## 3 <NA> <NA> NA
## 4 <NA> <NA> NA
## 5 <NA> <NA> NA
## 6 <NA> <NA> NA
## Abundance_Score HGVS.Consequence ClinVar.Germline.Classification rsIDs
## 1 NA <NA> <NA> <NA>
## 2 NA <NA> <NA> <NA>
## 3 NA <NA> <NA> <NA>
## 4 NA <NA> <NA> <NA>
## 5 NA <NA> <NA> <NA>
## 6 NA <NA> <NA> <NA>
## ClinVar.Variation.ID gnomad fitted residuals somatic resid
## 1 NA <NA> -0.1669595 2.44245136 <NA> <NA>
## 2 NA <NA> -0.1979372 2.58511520 <NA> <NA>
## 3 NA <NA> -0.1787797 0.16911091 <NA> <NA>
## 4 NA <NA> -0.1899017 -3.01509015 <NA> <NA>
## 5 NA <NA> -0.1742966 -0.08276039 <NA> <NA>
## 6 NA <NA> -0.1759730 0.72755725 <NA> <NA>
## min_dist_to_ligand
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
sum(!is.na(merged_df$min_dist_to_ligand)) #3738
## [1] 3738
merged_df <- merged_df %>% filter(!is.na(min_dist_to_ligand))
patho_df1 <- merged_df %>% filter (mutant %in% patho_df1$mutant)
nrow(patho_df1) #11
## [1] 11
patho_df2 <- merged_df %>% filter (mutant %in% patho_df2$mutant)
nrow(patho_df2) #11
## [1] 10
patho_df3 <- merged_df %>% filter (mutant %in% patho_df3$mutant)
nrow(patho_df3) #34
## [1] 33
pten_gnomad <- merged_df %>% filter (mutant %in% pten_gnomad$mutant)
nrow(pten_gnomad) #28
## [1] 28
pten_somatic <- merged_df %>% filter (mutant %in% pten_somatic$mutant)
nrow(pten_somatic) #37
## [1] 37
patho_df1$var_class2 <- "ASD/DD"
patho_df2$var_class2 <- "ASD/DD & PHTS"
patho_df3$var_class2 <- "PHTS"
pten_gnomad$var_class2 <- "gnomAD"
pten_somatic$var_class2 <- "somatic"
colnames(pten_gnomad)
## [1] "mutation_position" "mutant"
## [3] "DMS_score_abundance" "DMS_score_bin_abundance"
## [5] "ESM.1v" "DMS_score_activity"
## [7] "DMS_score_bin_activity" "V1"
## [9] "Model" "Dataset"
## [11] "ddG_pred" "position"
## [13] "wildtype" "mutation"
## [15] "pdb" "chain"
## [17] "new_position" "uniprot"
## [19] "exposure_SS" "exposure_rASA"
## [21] "spot_disorder" "func_esms_residue_class"
## [23] "func_esms_variant_class" "clinvar_clinical_significance"
## [25] "Phenotype_Class" "Phenotype_List"
## [27] "Fitness_Score" "Abundance_Score"
## [29] "HGVS.Consequence" "ClinVar.Germline.Classification"
## [31] "rsIDs" "ClinVar.Variation.ID"
## [33] "gnomad" "fitted"
## [35] "residuals" "somatic"
## [37] "resid" "min_dist_to_ligand"
## [39] "var_class2"
patho_df1 <- patho_df1 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2, residuals,min_dist_to_ligand)
patho_df2 <- patho_df2 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals,min_dist_to_ligand)
patho_df3 <- patho_df3 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals,min_dist_to_ligand)
pten_gnomad <- pten_gnomad %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals,min_dist_to_ligand)
pten_somatic <- pten_somatic %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals,min_dist_to_ligand)
combined_df <- rbind(patho_df1,patho_df2,patho_df3,pten_gnomad, pten_somatic)
nrow(combined_df)
## [1] 119
median_df <- combined_df %>%
group_by(var_class2) %>%
summarise(
median_dist = median(min_dist_to_ligand, na.rm = TRUE),
n = n()
)
median_df
## # A tibble: 5 × 3
## var_class2 median_dist n
## <chr> <dbl> <int>
## 1 ASD/DD 17.6 11
## 2 ASD/DD & PHTS 13.6 10
## 3 PHTS 14.7 33
## 4 gnomAD 20.4 28
## 5 somatic 15.0 37
head(combined_df)
## mutant DMS_score_abundance DMS_score_activity var_class2 residuals
## 1 R14G 1.041911071 -0.7107802 ASD/DD -0.5318023
## 2 Q17E 0.971256096 -0.5467923 ASD/DD -0.3682351
## 3 H61Y 0.941901563 -0.5830217 ASD/DD -0.4042427
## 4 Y68C -0.008473303 -2.7323465 ASD/DD 0.4783478
## 5 P95T 0.155054169 -0.4500599 ASD/DD 1.9428391
## 6 L139F -0.126631454 -0.4326210 ASD/DD 3.3608216
## min_dist_to_ligand
## 1 18.59907
## 2 15.13317
## 3 24.22288
## 4 12.59897
## 5 7.55193
## 6 17.61638
combined_df$var_class2 <- factor(
combined_df$var_class2,
levels = c("gnomAD", "somatic", "ASD/DD", "ASD/DD & PHTS", "PHTS")
)
custom_colors <- c(
"gnomAD" = "#1b9e77", # Teal green (unchanged)
"somatic" = "#fbb4ae", # Soft sky blue
"ASD/DD" = "#f4a6b3", # Warm orange
"ASD/DD & PHTS" = "#e7298a", # Muted purple
"PHTS" = "#7570b3" # Hot pink / magenta
)
active_positions <- c(
88:98, # WPD loop
159:171, # TI loop
35:49, # Arginine loop
123:130 # P loop
)
binding_positions <- c(
200:212, # CBR1
226:238, # CBR2
258:268, # CBR3
327:335, # Cα2 loop
151:174, # membrane-binding alpha-helix
1:15 # PBM motif
)
combined_df <- combined_df %>%
mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
combined_df$site_type <- "Non-orthosteric site"
combined_df$site_type[combined_df$mutation_position %in% active_positions] <- "Active site"
combined_df$site_type[combined_df$mutation_position %in% binding_positions] <- "Binding site"
table(combined_df$site_type)
##
## Active site Binding site Non-orthosteric site
## 26 22 71
p1 <- ggplot(combined_df, aes(x = var_class2, y = min_dist_to_ligand, fill = var_class2)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(aes(shape = site_type), width = 0.15, size = 2.5, alpha = 0.8,
fill = "lightgrey", color = "darkgrey", stroke = 0.5)+
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_class2, y = median_dist + 1, label = sprintf("%.2f", median_dist)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors, guide = "none") +
scale_shape_manual(values = c(
"Non-orthosteric site" = 21,
"Active site" = 22,
"Binding site" = 23
)) +
geom_text(
data = median_df,
aes(x = var_class2, y = 40, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "119 Variants",
x = "",
y = "Distance to TLA",
fill = "",
shape = "Site Type"
) +
theme_classic() +
theme(
legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
geom_signif(
comparisons = list(c("gnomAD", "somatic"),
c("gnomAD", "ASD/DD"),
c("gnomAD", "ASD/DD & PHTS"),
c("gnomAD", "PHTS")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
)
p1
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin_distance.pdf",
plot = p1, width = 6, height = 4, dpi = 300)
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
p1
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties

merged_df <- merge(pten_df_clean, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
merged_df <- merged_df %>% dplyr::select(-sequence)
merged_df <- merged_df %>% filter(!is.na(min_dist_to_ligand))
merged_df <- merged_df %>% filter(residuals <= 0)
nrow(merged_df)
## [1] 2003
merged_df_residue <- merged_df %>%
group_by(mutation_position) %>%
summarise(
loess_residual_avg = median(residuals, na.rm = TRUE))
merged_df_residue <- merge(merged_df_residue, protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
merged_df_residue <- merged_df_residue[!is.na(merged_df_residue$min_dist_to_ligand), ]
nrow(merged_df_residue) #280
## [1] 280
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model <- nls(abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand),
data = merged_df_residue,
start = list(a = 1, b = 0.1))
exp_model
## Nonlinear regression model
## model: abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand)
## data: merged_df_residue
## a b
## 2.54954 0.04794
## residual sum-of-squares: 152.4
##
## Number of iterations to convergence: 6
## Achieved convergence tolerance: 3.505e-06
# Nonlinear regression model
# model: abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand)
# data: merged_df_residue
# a b
# 2.54954 0.04794
# residual sum-of-squares: 152.4
#
# Number of iterations to convergence: 6
# Achieved convergence tolerance: 3.505e-06
a <- 2.54954
b <- 0.04794
half_d <- log(2) / b
half_d #14.45864
## [1] 14.45864
y_cutoff <- a * exp(- b * half_d)
y_cutoff
## [1] 1.27477
x_vals <- seq(min(merged_df_residue$min_dist_to_ligand),
max(merged_df_residue$min_dist_to_ligand), length.out = 200)
# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
fit <- try(nlsLM(abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand),
data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
fit_df_residue <- data.frame(
min_dist_to_ligand = x_vals,
loess_residual_avg = predict(exp_model, newdata = data.frame(min_dist_to_ligand = x_vals)),
lower = apply(boot_preds, 1, quantile, probs = 0.025),
upper = apply(boot_preds, 1, quantile, probs = 0.975)
)
nrow(merged_df_residue) #280
## [1] 280
merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[abs(merged_df_residue$loess_residual_avg) <= y_cutoff] <- "Null"
merged_df_residue$site_type[merged_df_residue$mutation_position %in% binding_positions] <- "Binding site"
merged_df_residue$site_type[merged_df_residue$mutation_position %in% active_positions] <- "Active site"
table(merged_df_residue$site_type)
##
## Active site Binding site Non-orthosteric site
## 45 52 62
## Null
## 121
max(merged_df_residue %>% filter (site_type == "Active site") %>% pull(min_dist_to_ligand)) #16.42677
## [1] 16.42677
# Plot with fitted curve
p23 <- ggplot(merged_df_residue, aes(x = min_dist_to_ligand, y = abs(loess_residual_avg))) +
# First, plot NULL points separately with alpha = 0.2
geom_point(data = subset(merged_df_residue, site_type == "Null"),
aes(color = site_type), size = 1, alpha = 0.7) +
# Then plot the rest
geom_point(data = subset(merged_df_residue, site_type != "Null"),
aes(color = site_type), size = 1) +
geom_text_repel(data = subset(merged_df_residue, mutation_position %in% c(123:130)) ,
aes(label = mutation_position, color = site_type), vjust = -0.5, size = 3) +
geom_text_repel(data = subset(merged_df_residue, mutation_position %in% c(1:14)) ,
aes(label = mutation_position, color = site_type), vjust = -0.5, size = 3) +
geom_text_repel(data = merged_df_residue %>% filter (loess_residual_avg <= y_cutoff) %>% filter (min_dist_to_ligand > 16.42677),
aes(label = mutation_position, color = site_type), vjust = 0.5, size = 3) +
geom_ribbon(data = fit_df_residue,
aes(x = min_dist_to_ligand, ymin = lower, ymax = upper),
fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
geom_line(data = fit_df_residue, aes(x = min_dist_to_ligand, y = abs(loess_residual_avg)),
inherit.aes = FALSE, color = "black", size = 1) +
scale_color_manual(values = c(
"Null" = "grey",
"Non-orthosteric site" = "darkgreen",
"Binding site" = "cyan",
"Active site" = "orange"
)) +
theme_classic() +
geom_vline(xintercept = 16.42677, linetype = "dashed", color = "slategrey") +
geom_hline(yintercept = y_cutoff, linetype = "dashed", color = "slategrey") +
labs(
title = "PTEN: per-residue allosteric decay",
subtitle = "280 residues",
x = "Minimal distance to TLA",
y = "Median residual",
color = ""
) + theme(legend.position = "bottom") +
annotate("text", x = Inf, y = Inf,
hjust = 1, vjust = 1,
label = "y = a * exp (b * x)\na = 2.54954 \nb = - 0.04794",
size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Duplicated aesthetics after name standardisation: hjust
p23 <- ggMarginal(
p23,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_decay1.pdf",
plot = p23, width = 5, height = 5, dpi = 300)
## Warning: ggrepel: 142 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p23
## Warning: ggrepel: 140 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

lm_model <- lm(log(abs(loess_residual_avg)) ~ min_dist_to_ligand, data = merged_df_residue)
summary(lm_model)
##
## Call:
## lm(formula = log(abs(loess_residual_avg)) ~ min_dist_to_ligand,
## data = merged_df_residue)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2854 -0.5895 0.0866 0.6599 1.5076
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.405279 0.134745 3.008 0.00287 **
## min_dist_to_ligand -0.033466 0.006372 -5.252 2.99e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8119 on 278 degrees of freedom
## Multiple R-squared: 0.09027, Adjusted R-squared: 0.087
## F-statistic: 27.59 on 1 and 278 DF, p-value: 2.992e-07
# Call:
# lm(formula = log(abs(loess_residual_avg)) ~ min_dist_to_ligand,
# data = merged_df_residue)
#
# Residuals:
# Min 1Q Median 3Q Max
# -3.2854 -0.5895 0.0866 0.6599 1.5076
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.405279 0.134745 3.008 0.00287 **
# min_dist_to_ligand -0.033466 0.006372 -5.252 2.99e-07 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.8119 on 278 degrees of freedom
# Multiple R-squared: 0.09027, Adjusted R-squared: 0.087
# F-statistic: 27.59 on 1 and 278 DF, p-value: 2.992e-07
merged_df <- merged_df %>% filter(residuals <= 0)
nrow(merged_df) #2003
## [1] 2003
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model <- nls(abs(residuals) ~ a * exp(-b * min_dist_to_ligand),
data = merged_df,
start = list(a = 1, b = 0.1))
exp_model
## Nonlinear regression model
## model: abs(residuals) ~ a * exp(-b * min_dist_to_ligand)
## data: merged_df
## a b
## 2.99859 0.04776
## residual sum-of-squares: 2231
##
## Number of iterations to convergence: 6
## Achieved convergence tolerance: 9.093e-07
# Nonlinear regression model
# model: abs(residuals) ~ a * exp(-b * min_dist_to_ligand)
# data: merged_df
# a b
# 2.99859 0.04776
# residual sum-of-squares: 2231
#
# Number of iterations to convergence: 6
# Achieved convergence tolerance: 9.093e-07
a <- 2.99859
b <- 0.04776
half_d <- log(2) / b
half_d #14.51313
## [1] 14.51313
y_cutoff <- a * exp(-b * half_d)
y_cutoff
## [1] 1.499295
merged_df$site_type <- "Non-orthosteric site"
merged_df$site_type[abs(merged_df$residuals) <= y_cutoff] <- "Null"
merged_df$site_type[merged_df$mutation_position %in% active_positions] <- "Active site"
merged_df$site_type[merged_df$mutation_position %in% binding_positions] <- "Binding site"
x_vals <- seq(min(merged_df$min_dist_to_ligand),
max(merged_df$min_dist_to_ligand), length.out = 200)
# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
samp <- merged_df[sample(nrow(merged_df), replace = TRUE), ]
fit <- try(nlsLM(abs(residuals) ~ a * exp(-b * min_dist_to_ligand),
data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
fit_df_residue <- data.frame(
min_dist_to_ligand = x_vals,
residuals = predict(exp_model, newdata = data.frame(min_dist_to_ligand = x_vals)),
lower = apply(boot_preds, 1, quantile, probs = 0.025),
upper = apply(boot_preds, 1, quantile, probs = 0.975)
)
merged_df$pathogenic_status <- ifelse(
merged_df$clinvar_clinical_significance %in% c("pathogenic",
"likely_pathogenic"),
"Pathogenic", "Other"
)
# Plot
p21 <- ggplot(merged_df, aes(x = min_dist_to_ligand, y = abs(residuals))) +
# Base layer: all points
geom_point(aes(color = site_type, alpha = pathogenic_status, shape = pathogenic_status), size = 2) +
# Loess fit line
geom_ribbon(data = fit_df_residue,
aes(x = min_dist_to_ligand, ymin = lower, ymax = upper),
fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
geom_line(data = fit_df_residue, aes(x = min_dist_to_ligand, y = abs(residuals)),
inherit.aes = FALSE, color = "black", size = 1) +
# Repelled text labels for selected mutations
geom_text_repel(
data = subset(merged_df, pathogenic_status == "Pathogenic" & min_dist_to_ligand > 16.42677 & residuals < y_cutoff & site_type == "Non-orthosteric site"),
aes(label = mutant,color = site_type),
size = 3,
max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
# Manual color palette for site type
scale_color_manual(values = c(
"Null" = "grey",
"Non-orthosteric site" = "darkgreen",
"Binding site" = "cyan",
"Active site" = "orange"
)) +
# Transparency scale
scale_alpha_manual(values = c("Other" = 0.1, "Pathogenic" = 1)) +
# Reference lines
geom_vline(xintercept = 16.42677, linetype = "dashed", color = "slategrey") +
geom_hline(yintercept = y_cutoff, linetype = "dashed", color = "slategrey") +
theme_classic() +
labs(
title = "Per-mutation Allosteric Decay",
subtitle = "2003 mutations with positive residuals",
x = "Minimal distance to active sites (TLA)",
y = "LOESS residual",
color = ""
) + theme(legend.position = "bottom") + guides(color = "none", alpha = "none") +
annotate("text", x = Inf, y = Inf,
hjust = 1, vjust = 1,
label = "y = a * exp (b * x)\na = 2.99859 \nb = - 0.04776",
size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Duplicated aesthetics after name standardisation: hjust
p21 <- ggMarginal(
p21,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_decay2.pdf",
plot = p21, width = 5, height = 5, dpi = 300)
p21

lm_model <- lm(log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
summary(lm_model)
##
## Call:
## lm(formula = log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3797 -0.6332 0.3234 0.9096 2.2610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.662759 0.074530 8.892 <2e-16 ***
## min_dist_to_ligand -0.051687 0.003745 -13.804 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.268 on 2001 degrees of freedom
## Multiple R-squared: 0.08694, Adjusted R-squared: 0.08649
## F-statistic: 190.5 on 1 and 2001 DF, p-value: < 2.2e-16
# Call:
# lm(formula = log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
#
# Residuals:
# Min 1Q Median 3Q Max
# -7.3797 -0.6332 0.3234 0.9096 2.2610
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.662759 0.074530 8.892 <2e-16 ***
# min_dist_to_ligand -0.051687 0.003745 -13.804 <2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 1.268 on 2001 degrees of freedom
# Multiple R-squared: 0.08694, Adjusted R-squared: 0.08649
# F-statistic: 190.5 on 1 and 2001 DF, p-value: < 2.2e-16
merged_df <- merge(pten_df_clean, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
merged_df <- merged_df %>% dplyr::select(-sequence)
merged_df <- merged_df %>% filter(!is.na(min_dist_to_ligand))
merged_df$pathogenic_status <- ifelse(
merged_df$clinvar_clinical_significance %in% c("pathogenic",
"likely_pathogenic"),
"Pathogenic", "Other"
)
nrow(merged_df)
## [1] 3738
merged_df_int <- merged_df %>% filter(mutation_position %in% all_positions)
nrow(merged_df_int) #1279
## [1] 1279
table(merged_df_int$pathogenic_status)
##
## Other Pathogenic
## 1224 55
# Other Pathogenic
# 1224 55
merged_df_out <- merged_df %>% filter(!mutation_position %in% all_positions)
nrow(merged_df_out) #2459
## [1] 2459
table(merged_df_out$pathogenic_status)
##
## Other Pathogenic
## 2404 55
# Other Pathogenic
# 2404 55
# 1. Set up
set.seed(123)
n_boot <- 1000
match_window <- 0.05
n_patho <- nrow(merged_df_int %>% filter(pathogenic_status == "Pathogenic"))
# 2. Get abundance/residuals from pten_patho
patho_df <- merged_df_int %>% filter(pathogenic_status == "Pathogenic") %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"
# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- merged_df_int %>% filter(pathogenic_status != "Pathogenic") %>%
filter(!(mutant %in% pten_patho$mutant)) %>%
filter(!(mutant %in% pten_somatic$mutant)) %>% dplyr::select(DMS_score_abundance, residuals)
bootstrap_medians <- vector("numeric", length = n_boot)
# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)
# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)
for (i in 1:n_boot) {
for (j in 1:n_patho) {
bin_j <- patho_df$bin[j]
candidates <- bin_lookup[[as.character(bin_j)]]
if (!is.null(candidates) && length(candidates) > 0) {
bootstrap_matrix[i, j] <- sample(candidates, 1)
}
}
}
# Summarize into a dataframe
boot_df <- data.frame(
group = "Random abundance-matched",
residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE) # Mean across matches per bootstrap
)
# Combine with patho residuals
plot_df <- bind_rows(
patho_df %>% dplyr::select(group, residuals),
boot_df
)
label_df <- plot_df %>%
group_by(group) %>%
summarise(
n = n(),
median_val = median(residuals),
y_max = max(residuals),
.groups = "drop"
)
label_df <- label_df %>%
mutate(n_label = case_when(
group == "Random abundance-matched" ~ "bootstrapped 1000 times",
TRUE ~ paste0("n = ", n)
))
# Plot
p_fast <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = label_df,
aes(x = group, y = 4.5, label = n_label),
inherit.aes = FALSE,
size = 4) +
geom_text(
data = label_df,
aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
inherit.aes = FALSE,
size = 6
) +
labs(
x = "",
y = "Activity-abundance residuals",
title = "Orthosteric variants"
) +
theme_classic(base_size = 14) +
scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
theme(legend.position = "none") +
geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
map_signif_level = FALSE,
test = "wilcox.test",
tip_length = 0.01)
p_fast

# 1. Set up
set.seed(123)
n_boot <- 1000
match_window <- 0.05
n_patho <- nrow(merged_df_out %>% filter(pathogenic_status == "Pathogenic"))
# 2. Get abundance/residuals from pten_patho
patho_df <- merged_df_out %>% filter(pathogenic_status == "Pathogenic") %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"
# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- merged_df_out %>% filter(pathogenic_status != "Pathogenic") %>%
filter(!(mutant %in% pten_patho$mutant)) %>%
filter(!(mutant %in% pten_somatic$mutant)) %>% dplyr::select(DMS_score_abundance, residuals)
bootstrap_medians <- vector("numeric", length = n_boot)
# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)
# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)
for (i in 1:n_boot) {
for (j in 1:n_patho) {
bin_j <- patho_df$bin[j]
candidates <- bin_lookup[[as.character(bin_j)]]
if (!is.null(candidates) && length(candidates) > 0) {
bootstrap_matrix[i, j] <- sample(candidates, 1)
}
}
}
# Summarize into a dataframe
boot_df <- data.frame(
group = "Random abundance-matched",
residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE) # Mean across matches per bootstrap
)
# Combine with patho residuals
plot_df <- bind_rows(
patho_df %>% dplyr::select(group, residuals),
boot_df
)
label_df <- plot_df %>%
group_by(group) %>%
summarise(
n = n(),
median_val = median(residuals),
y_max = max(residuals),
.groups = "drop"
)
label_df <- label_df %>%
mutate(n_label = case_when(
group == "Random abundance-matched" ~ "bootstrapped 1000 times",
TRUE ~ paste0("n = ", n)
))
# Plot
p_fast2 <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = label_df,
aes(x = group, y = 4.5, label = n_label),
inherit.aes = FALSE,
size = 4) +
geom_text(
data = label_df,
aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
inherit.aes = FALSE,
size = 6
) +
labs(
x = "",
y = "Activity-abundance residual",
title = "Non-orthosteric variants"
) +
theme_classic(base_size = 14) +
scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
theme(legend.position = "none") +
geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
map_signif_level = FALSE,
test = "wilcox.test",
tip_length = 0.01)
p_fast2

fig4G <- plot_grid(p_fast, p_fast2, nrow = 2, ncol=1)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin5.pdf",
plot = fig4G, width = 3, height = 6, dpi = 300)
fig4G
